This notebook brings the 2018 data into alignment with the desired format with respect to field names, types, and grouping.
# Imports ----
import re
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import pickle
import tqdm
from g2fd.internal import *
#| default_exp internal
# 2018
year_string = '2018'
meta_path = './data/raw/GenomesToFields_G2F_Data_2018/e._2018_supplemental_info/g2f_2018_field_metadata.csv'
phno_path = './data/raw/GenomesToFields_G2F_Data_2018/a._2018_hybrid_phenotypic_data/g2f_2018_hybrid_data_clean.csv' # geno_path = None,
wthr_path = './data/raw/GenomesToFields_G2F_Data_2018/b._2018_weather_data/g2f_2018_weather_clean.csv'
soil_path = './data/raw/GenomesToFields_G2F_Data_2018/c._2018_soil_data/g2f_2018_soil_data.csv'
mgmt_path = './data/raw/GenomesToFields_G2F_Data_2018/e._2018_supplemental_info/g2f_2018_agronomic information.csv'
meta = pd.read_csv(meta_path, encoding = "ISO-8859-1", low_memory=False)
phno = pd.read_csv(phno_path, encoding = "ISO-8859-1", low_memory=False)
wthr = pd.read_csv(wthr_path, encoding = "ISO-8859-1", low_memory=False)
soil = pd.read_csv(soil_path, encoding = "ISO-8859-1", low_memory=False)
mgmt = pd.read_csv(mgmt_path, encoding = "ISO-8859-1", low_memory=False)
# load dicts for column renaming
meta_name_dict = mk_name_dict(name = 'meta')
phno_name_dict = mk_name_dict(name = 'phno')
soil_name_dict = mk_name_dict(name = 'soil')
wthr_name_dict = mk_name_dict(name = 'wthr')
mgmt_name_dict = mk_name_dict(name = 'mgmt')
Naming rules: raw column headers are renamed to a consistent Title_Case scheme, with measured quantities suffixed by their unit as "_Unit_<unit>" (e.g. Plant_Height_Unit_cm, Grain_Yield_Unit_bu_Per_A). A hypothetical sketch of the mapping pattern is shown below.
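A hypothetical sketch of the mapping pattern (the raw headers below are invented; the real entries come from mk_name_dict):
example_dict = {
    'Plant Height [cm]': 'Plant_Height_Unit_cm',  # invented raw header
    'Grain Yield (bu/A)': 'Grain_Yield_Unit_bu_Per_A',  # invented raw header
}
pd.DataFrame(columns=list(example_dict)).rename(columns=example_dict).columns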
(find_unrecognized_columns(df = meta, dct = meta_name_dict),
find_unrecognized_columns(df = phno, dct = phno_name_dict),
find_unrecognized_columns(df = soil, dct = soil_name_dict),
find_unrecognized_columns(df = wthr, dct = wthr_name_dict),
find_unrecognized_columns(df = mgmt, dct = mgmt_name_dict))
([], [], [], [], [])
meta = meta.rename(columns=meta_name_dict)
phno = phno.rename(columns=phno_name_dict)
soil = soil.rename(columns=soil_name_dict)
wthr = wthr.rename(columns=wthr_name_dict)
mgmt = mgmt.rename(columns=mgmt_name_dict)
# add indicator columns to help with debugging merge
meta['meta'] = True
phno['phno'] = True
soil['soil'] = True
wthr['wthr'] = True
mgmt['mgmt'] = True
[e.shape for e in [meta, phno, soil, wthr, mgmt]]
[(30, 55), (24629, 41), (16, 29), (219510, 29), (149, 7)]
meta = sanitize_Experiment_Codes(
df = meta,
simple_renames = {
'MOH1- rep 1': 'MOH1-Rep1',
'MOH1- rep 2': 'MOH1-Rep2',
'TXH1- Dry': 'TXH1-Dry',
'TXH1- Early': 'TXH1-Early',
'TXH1- Late': 'TXH1-Late'
}, split_renames = {})
mgmt = sanitize_Experiment_Codes(
df = mgmt,
simple_renames = {
'MOH1- rep 1': 'MOH1-Rep1'
}, split_renames = {})
phno.columns
Index(['Experiment_Code', 'Drop_Record_Index', 'Source', 'Pedigree', 'Family',
'Replicate', 'Block', 'Plot', 'Range', 'Pass', 'Tester', 'Local_Check',
'Plot_Length_Unit_Feet', 'Alley_Length_Unit_Inches',
'Row_Spacing_Unit_Inches', 'Plot_Area_Unit_Feet2', 'Rows_Per_Plot',
'Packets_Per_Plot', 'Kernels_Per_Packet', 'Seeds_Per_Plot',
'Planted_Unit_Datetime', 'Harvested_Unit_Datetime',
'Anthesis_Unit_Datetime', 'Silking_Unit_Datetime', 'Anthesis_Unit_Days',
'Silking_Unit_Days', 'Plant_Height_Unit_cm', 'Ear_Height_Unit_cm',
'Stand_Count_Unit_Number', 'Stand_Count_Unit_Percent',
'Root_Lodging_Unit_Number', 'Stalk_Lodging_Unit_Number',
'Grain_Moisture_Unit_Percent', 'Test_Weight_Unit_lbs',
'Plot_Weight_Unit_lbs', 'Grain_Yield_Unit_bu_Per_A', 'Discarded',
'Phenotype_Comments', 'Filler', 'Additional_Metics', 'phno'],
dtype='object')
# confirm everything's okay
print(
'meta', find_unrecognized_experiments(meta.Experiment_Code, return_all_exps=False),
'\nphno', find_unrecognized_experiments(phno.Experiment_Code, return_all_exps=False),
'\nsoil', find_unrecognized_experiments(soil.Experiment_Code, return_all_exps=False),
'\nwthr', find_unrecognized_experiments(wthr.Experiment_Code, return_all_exps=False),
'\nmgmt', find_unrecognized_experiments(mgmt.Experiment_Code, return_all_exps=False),
'\nall ', find_unrecognized_experiments([], return_all_exps=True)
)
meta [] phno [] soil [] wthr [] mgmt [] all ['ARH1', 'ARH2', 'COH1', 'DEH1', 'GAH1', 'GAH2', 'GEH1', 'GEH2', 'IAH1', 'IAH2', 'IAH2 ', 'IAH3', 'IAH3 ', 'IAH4', 'IAH4 ', 'ILH1', 'INH1', 'KSH1', 'KSH2', 'KSH3', 'MIH1', 'MNH1', 'MOH1', 'MOH1 ', 'MOH1-Rep1', 'MOH1-Rep2', 'NCH1', 'NEH1', 'NEH2', 'NEH3', 'NYH1', 'NYH1', 'NYH2', 'NYH3', 'NYS1', 'OHH1', 'ONH1', 'ONH2', 'SCH1', 'TXH1', 'TXH1-Dry', 'TXH1-Early', 'TXH1-Late', 'TXH2', 'TXH3', 'TXH4', 'W1H1', 'W1H2', 'WIH1', 'WIH2', 'WIH3']
# Values in MNH1 have a duplicate row for each observation, with one row of each pair partially missing data
M = phno.loc[phno.Experiment_Code == "MNH1", ]
M2 = M.groupby(['Experiment_Code', 'Replicate', 'Block', 'Range', 'Pass', 'Plot',
'Family', 'Pedigree', 'Source']).agg(
Grain_Yield_Unit_bu_Per_A = ('Grain_Yield_Unit_bu_Per_A', np.mean),
diff = ('Grain_Yield_Unit_bu_Per_A', lambda x: np.nanmax(x) - np.nanmin(x))
).reset_index()
import plotly.express as px
px.scatter(M2, x = 'Grain_Yield_Unit_bu_Per_A', y='diff', color= 'diff')
/tmp/ipykernel_3887/937059992.py:8: RuntimeWarning: All-NaN axis encountered
diff = ('Grain_Yield_Unit_bu_Per_A', lambda x: np.nanmax(x) - np.nanmin(x))
M = phno.loc[(phno.Experiment_Code == 'MNH1' ), :]
# The solution is to extend the identifiers (these data ostensibly come from the same physical locations); see the toy sketch below
# sort into duplicate-pair order, then use row-number parity to tag the members of each pair
M_update_ids = (M
    .sort_values(['Experiment_Code', 'Replicate', 'Block', 'Range', 'Pass', 'Plot'])  # same as going off `Duplicate_Group`
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'Add_AB'})
)
M_update_ids.Add_AB = M_update_ids.Add_AB.mod(2)
M_update_ids.loc[M_update_ids.Add_AB == 0, 'Add_AB'] = 'A'
M_update_ids.loc[M_update_ids.Add_AB == 1, 'Add_AB'] = 'B'
M_update_ids.Replicate = M_update_ids.Replicate.astype('string') + M_update_ids.Add_AB
M_update_ids = M_update_ids.drop(columns='Add_AB')
M_update_ids.Replicate = M_update_ids.Replicate.astype('string')
phno.Replicate = phno.Replicate.astype('string')
shape_before_replacement = phno.shape
# now merge back in
mask = (phno.Experiment_Code == 'MNH1')
# shape should be the same
assert phno.loc[~mask, ].merge(M_update_ids, how = 'outer').shape == shape_before_replacement
phno = phno.loc[~mask, ].merge(M_update_ids, how = 'outer').copy()
# phno = phno.loc[~mask, ].copy()
# phno = phno.merge(M_update_ids, how = 'outer').copy()
# import seaborn as sns
# sns.heatmap(phno.isnull(), yticklabels=False, cbar=False, cmap = 'viridis')
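A toy illustration of the A/B suffix trick used above (hypothetical values):
toy = pd.DataFrame({'Replicate': ['1', '1', '2', '2']})
toy['Add_AB'] = pd.Series(range(len(toy))).mod(2).map({0: 'A', 1: 'B'})
toy['Replicate'] = toy['Replicate'] + toy['Add_AB']
list(toy['Replicate'])  # ['1A', '1B', '2A', '2B']: each row of a duplicate pair now has a distinct id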
# Find minimum cols needed to index all rows
df = phno#.loc[(phno.Experiment_Code == 'MNH1' ), :]
df = df.drop(columns='Drop_Record_Index').drop_duplicates()
id_cols = [#'Year',
'Experiment_Code', 'Replicate', 'Block', 'Range', 'Pass', 'Plot',]
# candidate_cols = [
# 'Experiment_Code',
# # 'Drop_Record_Index',
# 'Source', 'Pedigree', 'Family',
# 'Tester', 'Local_Check',
# 'Plot_Length_Unit_', 'Alley_Length_Unit_', 'Row_Spacing_Unit_',
# 'Plot_Area_Unit_', 'Rows_Per_Plot', 'Packets_Per_Plot',
# 'Kernels_Per_Packet', 'Seeds_Per_Plot', 'Planted_Unit_Datetime',
# 'Harvested_Unit_Datetime', 'Anthesis_Unit_Datetime',
# 'Silking_Unit_Datetime', 'Anthesis_Unit_Days', 'Silking_Unit_Days',
# 'Plant_Height_Unit_cm', 'Ear_Height_Unit_cm', 'Stand_Count_Unit_Number',
# 'Stand_Count_Unit_Percent', 'Root_Lodging_Unit_Number',
# 'Stalk_Lodging_Unit_Number', 'Grain_Moisture_Unit_Percent',
# 'Test_Weight_Unit_lbs', 'Plot_Weight_Unit_lbs',
# 'Grain_Yield_Unit_bu_Per_A', 'Discarded', 'Phenotype_Comments',
# 'Filler', 'Additional_Metics', 'phno'
# ]
# target = df.shape[0]
# output = pd.DataFrame(zip(
# candidate_cols,
# [df.loc[:, id_cols+[e]].drop_duplicates().shape[0] for e in candidate_cols]
# ), columns=['Additional_ID', 'Uniq_Vals'])
# output.assign(At_Target=lambda x:x.Uniq_Vals == target)
# test that all rows are uniquely indexed with the selected id columns
assert df.loc[:, id_cols].drop_duplicates().shape[0] == df.drop_duplicates().shape[0]
# separate static and dynamic values
sval = phno.merge(soil, how = 'outer')
meta.Alley_Length_Unit_Inches = meta.Alley_Length_Unit_Inches.astype(float) # must be converted for merging
sval = sval.merge(meta, how = 'outer')
# these tables are different enough that we'll keep them separate
# unfortunately we need multiple copies (one per plot) because at least one field treats different passes differently
mgmt = phno.loc[:, ['Experiment_Code', 'Range', 'Pass', 'Plot', 'phno']
].drop_duplicates().merge(mgmt, how = 'outer')
mgmt = mgmt.loc[mgmt.mgmt.notna(), :].drop(columns = 'phno')
# confirm there are no rows in mgmt that are not in phno
# temp = mgmt.loc[(~mgmt.phno & mgmt.mgmt), :]
# if 0 != temp.shape[0]:
# print(temp)
# else:
# mgmt = mgmt.loc[mgmt.mgmt.notna(), :].drop(columns = 'phno')
# # wthr
# # There's only ever one weather station so we have to worry about imputation but not duplicates
# Set each id col to a string
for i in ['Experiment_Code', 'Range', 'Pass', 'Plot']:
sval[i] = sval[i].astype('string')
mgmt[i] = mgmt[i].astype('string')
if i not in ['Range', 'Pass', 'Plot']:
wthr[i] = wthr[i].astype('string')
The pattern to use is as follows. The main tasks that need to be completed are:
- Identify values that can't be converted to the expected data type, using the "find_unconvertable_" family of functions (e.g. find_unconvertable_datetimes, find_unconvertable_numerics).
- For simple renaming (e.g. misspellings) or for splitting non-tidy data into two rows ("entry1-entry2" -> "entry1", "entry2"), use sanitize_col.
- Move values that are ambiguous but pertain to data imputation to "Imputation_Notes" using relocate_to_Imputation_Notes.
- If new columns need to be added (e.g. mgmt.Ingredient for parsed components of Product, such as elements), use safe_create_col.
- Make any one-off changes manually.
- Confirm columns match the expected types with check_df_dtype_expectations, and report mismatches.
These steps are completed for each dataframe in turn to minimize the reader's cognitive load; a sketch of the pattern follows the note below.
Note: to handle missing values, some columns that would otherwise be ints are floats.
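A minimal sketch of the pattern on a toy Series (illustrative only; the real columns are handled below):
toy_col = pd.Series(['1.5', '2', 'about 3'])  # 'about 3' should be flagged
find_unconvertable_numerics(df_col = toy_col, index = False)
# expect ['about 3']; repair such values with sanitize_col, after which the cast is safe:
pd.Series(['1.5', '2', '3']).astype('float')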
sval_col_dtypes = mk_dtype_dict(name = 'sval')
wthr_col_dtypes = mk_dtype_dict(name = 'wthr')
mgmt_col_dtypes = mk_dtype_dict(name = 'mgmt')
# convert the date cols into datetime. Lean on pd.to_datetime() to infer the format, assuming each site uses a consistent format.
for e in ['Planted_Unit_Datetime',
'Harvested_Unit_Datetime',
'Anthesis_Unit_Datetime',
'Silking_Unit_Datetime',
'Recieved_Date_Unit_Datetime',
'Processed_Date_Unit_Datetime',
'Weather_Station_Placed_Unit_Datetime',
'Weather_Station_Removed_Unit_Datetime'
]:
# find_unconvertable_datetimes(df_col=sval[e], pattern='%Y-%m-%d %H:%M', index=False)
sval['Datetime_Temp'] = pd.to_datetime(np.nan)
for code in list(sval.Experiment_Code.drop_duplicates()):
# code = list(sval.Experiment_Code.drop_duplicates())[0]
sval.loc[sval.Experiment_Code == code, 'Datetime_Temp'
] = pd.to_datetime(sval.loc[sval.Experiment_Code == code, e])
sval.loc[:, e] = sval.loc[:, 'Datetime_Temp']
sval = sval.drop(columns = 'Datetime_Temp')
# -> floats
[find_unconvertable_numerics(df_col = sval[e], index = False) for e in [
'Alley_Length_Unit_Inches',
'Row_Spacing_Unit_Inches',
'Pounds_Needed_Soil_Moisture'
]]
sval = sanitize_col(
df = sval,
col = 'Pounds_Needed_Soil_Moisture',
simple_renames= {'56 lb/bu and adjusted to 15.5% moisture':'56',
'4 or 5': '4.5'},
split_renames= {})
# convert types
for e in ['Alley_Length_Unit_Inches', 'Row_Spacing_Unit_Inches', 'Pounds_Needed_Soil_Moisture',
'Anthesis_Unit_Days', 'Silking_Unit_Days', 'Kernels_Per_Plot']:
err_list = find_unconvertable_numerics(df_col = sval[e], index = False)
if err_list != []:
print(e)
print(err_list)
else:
sval[e] = sval[e].astype('float')
# to bool
# set missing to false
sval.loc[sval.Discarded.isna(), 'Discarded'] = 'False'
sval.Discarded = sval.Discarded.map({'True': True, 'False': False})
# # to float
# sval.Pounds_Needed_Soil_Moisture.astype(float)
# to bool
sval['phno'] = sval['phno'].astype('bool')
sval['soil'] = sval['soil'].astype('bool')
sval['meta'] = sval['meta'].astype('bool')
sval['Discarded'] = sval['Discarded'].astype('bool')
# to string
sval = cols_astype_string(
df = sval,
col_list = [key for key in sval_col_dtypes.keys() if sval_col_dtypes[key] == 'string'])
# sval.Year = year_string
# sval.Year = sval.Year.astype('string')
sval = sval.drop(columns=[
'Drop_Record_Index',
'Additional_Metics' # 0 or NaN
])
checkpoint = check_df_dtype_expectations(df = sval, dtype_dct = sval_col_dtypes)
if sum(checkpoint.Pass)/checkpoint.shape[0] == 1:
pass
else:
print(checkpoint.loc[~checkpoint.Pass, ])
118/118 Columns pass.
# instead of writing regexes to figure out the most likely format for each datetime, we assume each experiment is internally consistent
# and let pandas figure it out.
wthr['Datetime_Temp'] = pd.to_datetime(np.nan)
wthr['Datetime'] = pd.to_datetime(np.nan) # no Datetime col
wthr['Datetime_Temp'] = wthr['Year'].astype('string')+'-'+wthr['Month'].astype('string')+'-'+wthr['Day'].astype('string')+' '+wthr['Time'].astype('string')
# convert types
err_list = find_unconvertable_datetimes(df_col=wthr['Datetime_Temp'], pattern='%Y-%m-%d %H:%M', index=False)
if err_list != []:
print(err_list)
else:
wthr.Datetime_Temp = pd.to_datetime(pd.Series(wthr.Datetime_Temp), errors='coerce')
wthr.Datetime = wthr.Datetime_Temp
wthr = wthr.drop(columns= 'Datetime_Temp')
# 'T' presumably denotes a trace amount of rainfall; it can't be cast to float, so treat it as missing
mask = (wthr.Rainfall_Unit_mm == 'T')
wthr.loc[mask, 'Rainfall_Unit_mm'] = np.nan
wthr.Rainfall_Unit_mm = wthr.Rainfall_Unit_mm.astype(float)
# to bool
wthr = sanitize_col(
df = wthr,
col = 'Data_Cleaned',
simple_renames= {
'Yes':'True',
'No':'False'},
split_renames= {})
# set missing to false
wthr.loc[wthr.Data_Cleaned.isna(), 'Data_Cleaned'] = 'False'
wthr.Data_Cleaned = wthr.Data_Cleaned.map({'True': True, 'False': False})
# wthr.loc[:, 'Data_Cleaned'].drop_duplicates()
# to string
wthr = cols_astype_string(
df = wthr,
col_list = [key for key in wthr_col_dtypes.keys() if wthr_col_dtypes[key] == 'string'])
wthr.Year = year_string
wthr.Year = wthr.Year.astype('string')
wthr = wthr.drop(columns = ['Drop_Record_Index',
'Data_Cleaned', # 0 non-null
'Soil_EC_Unit_mS_per_cm' # 0 non-null
])
checkpoint = check_df_dtype_expectations(df = wthr, dtype_dct = wthr_col_dtypes)
if sum(checkpoint.Pass)/checkpoint.shape[0] == 1:
pass
else:
print(checkpoint.loc[~checkpoint.Pass, ])
27/27 Columns pass.
mgmt['Datetime'] = pd.to_datetime(np.nan)
# two-pass parse: rows that do not match '%m/%d/%y' are copied over first and parsed
# with pd.to_datetime's format inference, then the '%m/%d/%y' rows are filled in and converted
mask = find_unconvertable_datetimes(df_col=mgmt.Date_Datetime, pattern='%m/%d/%y', index=True)
mgmt.loc[mask, 'Datetime'] = mgmt.loc[mask, 'Date_Datetime']
mgmt['Datetime'] = pd.to_datetime(mgmt['Datetime'])
mgmt.loc[~mask, 'Datetime'] = mgmt.loc[~mask, 'Date_Datetime']
mgmt['Datetime'] = pd.to_datetime(mgmt['Datetime'])
mgmt['Date_Datetime'] = mgmt['Datetime']
mgmt = mgmt.drop(columns=['Datetime'])
mgmt = mgmt.copy()
# convert types
err_list = find_unconvertable_datetimes(df_col=mgmt.Date_Datetime, pattern='%m/%d/%y', index=False)
if err_list != []:
print(err_list)
else:
mgmt.Date_Datetime = pd.to_datetime(pd.Series(mgmt.Date_Datetime), format = '%m/%d/%y', errors='coerce')
# split combined "amount unit" strings into separate Amount_Per_Acre and Unit values
mask = (mgmt.Amount_Per_Acre == '1.6 qt')
mgmt.loc[mask, ['Amount_Per_Acre', 'Unit']] = ['1.6', 'qt']
mask = (mgmt.Amount_Per_Acre == '1pt')
mgmt.loc[mask, ['Amount_Per_Acre', 'Unit']] = ['1', 'pt']
mask = (mgmt.Amount_Per_Acre == '2pt')
mgmt.loc[mask, ['Amount_Per_Acre', 'Unit']] = ['2', 'pt']
mask = (mgmt.Amount_Per_Acre == '2 qt')
mgmt.loc[mask, ['Amount_Per_Acre', 'Unit']] = ['2', 'qt']
mgmt = sanitize_col(
df = mgmt,
col = 'Amount_Per_Acre',
simple_renames= {'180 N': '180 (N)',
'130 (42 GPA)': '130',
'120-0-0': '120 (N)',
'NPK 0-0-39': '39 (K)'},
split_renames= {'NPK 5-25-0': ['5 (N)', '25 (P)'],
'27N, 26P': ['27 (N)', '26 (P)'],
'21/48': ['21 (Outlook)', '48, (Infantry)'],
'3/32/2.5%/1%': ['3 (Calisto)', '32 (Atrazine)', '0.025 (AMS)', '0.01 (COC)'],
'0-100-100': ['100 (P)', '100 (K)']})
mgmt = safe_create_col(mgmt, "Ingredient")
mask = mgmt.Ingredient.isna()
mgmt.loc[mask, 'Ingredient'] = mgmt.loc[mask, 'Product']
# assume each string is formatted as 'val (key)'; `sanitize_col` should be used to enforce this. (A quick check of this parse follows below.)
for e in ['0.025 (AMS)', '0.01 (COC)', '25 (P)', '3 (Calisto)', '180 (N)', '120 (N)', '27 (N)', '26 (P)', '100 (P)', '21 (Outlook)', '100 (K)', '32 (Atrazine)', '39 (K)', '5 (N)', '48, (Infantry)']:
    val = re.findall(r'^\d+[.]*\d*', e)[0]
    key = re.findall(r'\(.+\)', e)[0].replace('(', '').replace(')', '')
mask = (mgmt['Amount_Per_Acre'] == e)
mgmt.loc[mask, 'Ingredient'] = key
mgmt.loc[mask, 'Amount_Per_Acre'] = val
# convert types
err_list = find_unconvertable_numerics(df_col = mgmt['Amount_Per_Acre'], index = False)
if err_list != []:
print(err_list)
else:
mgmt.Amount_Per_Acre = pd.to_numeric(mgmt.Amount_Per_Acre, errors='coerce')
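A quick sanity check of the 'val (key)' parsing used above (the example string is taken from the list handled in the loop):
s = '0.025 (AMS)'
(re.findall(r'^\d+[.]*\d*', s)[0], re.findall(r'\(.+\)', s)[0].strip('()'))  # ('0.025', 'AMS')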
Ingredient is to be the cleaned-up version of the "Product" column.
# list(mgmt.loc[:, 'Ingredient'].drop_duplicates())
# to bool
mgmt['mgmt'] = mgmt['mgmt'].astype('bool')
# to string
for e in [ee for ee in ['Application', 'Product', 'Ingredient', 'Unit', 'Imputation_Notes'] if ee in mgmt.columns]:
mgmt[e] = mgmt[e].astype('string')
# mgmt.Year = year_string
# mgmt.Year = mgmt.Year.astype('string')
check_df_dtype_expectations(df = mgmt, dtype_dct = mgmt_col_dtypes)
11/11 Columns pass.
| | Column | dtype | Expected_dtype | Pass |
|---|---|---|---|---|
| 0 | Experiment_Code | string | string | True |
| 1 | Range | string | string | True |
| 2 | Pass | string | string | True |
| 3 | Plot | string | string | True |
| 4 | Application | string | string | True |
| 5 | Product | string | string | True |
| 6 | Date_Datetime | datetime64[ns] | datetime64[ns] | True |
| 7 | Amount_Per_Acre | float64 | float64 | True |
| 8 | Unit | string | string | True |
| 9 | mgmt | bool | bool | True |
| 10 | Ingredient | string | string | True |
write_out_pkl(obj = sval, path = './data/interim/'+year_string+'sval.pickle')
write_out_pkl(obj = wthr, path = './data/interim/'+year_string+'wthr.pickle')
write_out_pkl(obj = mgmt, path = './data/interim/'+year_string+'mgmt.pickle')